from datasets import load_dataset
import random
# Load the training split of the ShareGPT dataset (presumably Chinese-only,
# going by the repository name).
dataset = load_dataset("larryvrh/ShareGPT-Zh_Only", split="train")

# Labelled sentence-pair records that are later written to the JSON Lines file.
real_data = []

# One list of utterance strings per conversation; used as the pool from which
# negative examples are drawn.
data_for_nsp = []
import json
# print(dataset['conversations'][1])
# Flatten every conversation into a plain list of its utterance texts, keeping
# the conversation order so that position N in data_for_nsp corresponds to
# conversation N of the dataset.
for conversation in dataset['conversations']:
    utterances = [turn['value'] for turn in conversation[:-1]]
    # The final turn is read by index on purpose: a conversation without any
    # turns still fails right here instead of yielding an empty pool entry.
    utterances.append(conversation[-1]['value'])
    data_for_nsp.append(utterances)

# Build the labelled sentence pairs.
#
# For every pair of adjacent utterances inside a conversation one positive
# record (label 1) is emitted, followed by one negative record (label 0) that
# pairs the same first utterance with a random utterance taken from a
# *different* conversation.
#
# The utterance texts are taken from data_for_nsp, which already holds the
# 'value' of every turn in dataset order, so the dataset column does not have
# to be read a second time.
conversation_count = len(data_for_nsp)

for i, utterances in enumerate(data_for_nsp):
    for current, following in zip(utterances, utterances[1:]):
        real_data.append({'pair1': current, 'pair2': following, 'label': 1})

        # With a single conversation there is nothing to sample a negative
        # from; retrying until "another" index shows up would never finish.
        if conversation_count < 2:
            continue

        # Draw uniformly from the indices other than i: pick from a range
        # that is one shorter and shift everything at or above i up by one.
        k = random.randrange(conversation_count - 1)
        if k >= i:
            k += 1

        real_data.append({
            'pair1': current,
            'pair2': random.choice(data_for_nsp[k]),
            'label': 0,
        })

# Dump the records as JSON Lines: one JSON object per line, with non-ASCII
# characters written as-is rather than as \u escapes.
with open('output.jsonl', 'w', encoding='utf-8') as f:
    f.writelines(
        json.dumps(record, ensure_ascii=False) + '\n'
        for record in real_data
    )